library("FRESA.CAD")
library(readxl)
library(igraph)
library(umap)
library(tsne)
library(entropy)
op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)
Data from the speech features
pd_speech_features <- as.data.frame(read_excel("~/GitHub/FCA/Data/pd_speech_features.xlsx",sheet = "pd_speech_features", range = "A2:ACB758"))
Each subject had three repeated observations. Here I’ll use the average of the three experiments per subject.
rep1Parkison <- subset(pd_speech_features,RID==1)
rownames(rep1Parkison) <- rep1Parkison$id
rep1Parkison$id <- NULL
rep1Parkison$RID <- NULL
rep1Parkison[,1:ncol(rep1Parkison)] <- sapply(rep1Parkison,as.numeric)
rep2Parkison <- subset(pd_speech_features,RID==2)
rownames(rep2Parkison) <- rep2Parkison$id
rep2Parkison$id <- NULL
rep2Parkison$RID <- NULL
rep2Parkison[,1:ncol(rep2Parkison)] <- sapply(rep2Parkison,as.numeric)
rep3Parkison <- subset(pd_speech_features,RID==3)
rownames(rep3Parkison) <- rep3Parkison$id
rep3Parkison$id <- NULL
rep3Parkison$RID <- NULL
rep3Parkison[,1:ncol(rep3Parkison)] <- sapply(rep3Parkison,as.numeric)
whof <- !(colnames(rep1Parkison) %in% c("gender","class"));
avgParkison <- rep1Parkison;
avgParkison[,whof] <- (rep1Parkison[,whof] + rep2Parkison[,whof] + rep3Parkison[,whof])/3
signedlog <- function(x) { return (sign(x)*log(abs(1.0e12*x)+1.0))}
whof <- !(colnames(avgParkison) %in% c("gender","class"));
avgParkison[,whof] <- signedlog(avgParkison[,whof])
studyName <- "Parkinsons"
dataframe <- avgParkison
outcome <- "class"
TopVariables <- 10
thro <- 0.80
cexheat = 0.15
Some libraries
library(psych)
library(whitening)
library("vioplot")
pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
| rows | col |
|---|---|
| 252 | 753 |
pander::pander(table(dataframe[,outcome]))
| 0 | 1 |
|---|---|
| 64 | 188 |
varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]
largeSet <- length(varlist) > 1000
Scaling and removing near zero variance columns and highly co-linear(r>0.99999) columns
### Some global cleaning
sdiszero <- apply(dataframe,2,sd) > 1.0e-16
dataframe <- dataframe[,sdiszero]
varlist <- colnames(dataframe)[colnames(dataframe) != outcome]
tokeep <- c(as.character(correlated_Remove(dataframe,varlist,thr=0.99999)),outcome)
dataframe <- dataframe[,tokeep]
varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]
dataframe <- FRESAScale(dataframe,method="OrderLogit")$scaledData
if (!largeSet)
{
hm <- heatMaps(data=dataframe,
Outcome=outcome,
Scale=TRUE,
hCluster = "row",
xlab="Feature",
ylab="Sample",
srtCol=45,
srtRow=45,
cexCol=cexheat,
cexRow=cexheat
)
par(op)
}
The heat map of the data
if (!largeSet)
{
par(cex=0.6,cex.main=0.85,cex.axis=0.7)
#cormat <- Rfast::cora(as.matrix(dataframe[,varlist]),large=TRUE)
cormat <- cor(dataframe[,varlist],method="pearson")
cormat[is.na(cormat)] <- 0
gplots::heatmap.2(abs(cormat),
trace = "none",
# scale = "row",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Original Correlation",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="Pearson Correlation",
xlab="Feature", ylab="Feature")
diag(cormat) <- 0
print(max(abs(cormat)))
}
[1]
0.9999953
DEdataframe <- IDeA(dataframe,verbose=TRUE,thr=thro)
#>
#> Included: 681 , Uni p: 0.01697637 , Uncorrelated Base: 183 , Outcome-Driven Size: 0 , Base Size: 183
#>
#>
1 <R=1.000,w= 1,N= 358>, Top: 85( 42 )[ 1 : 85 : 0.975 ]( 83 , 229 , 0 ),<|>Tot Used: 312 , Added: 229 , Zero Std: 0 , Max Cor: 1.000
#>
2 <R=1.000,w= 1,N= 358>, Top: 21( 7 )[ 1 : 21 : 0.975 ]( 21 , 45 , 83 ),<|>Tot Used: 345 , Added: 45 , Zero Std: 0 , Max Cor: 0.996
#>
3 <R=0.996,w= 1,N= 358>, Top: 11( 4 )[ 1 : 11 : 0.973 ]( 11 , 14 , 104 ),<|>Tot Used: 357 , Added: 14 , Zero Std: 0 , Max Cor: 0.973
#>
4 <R=0.973,w= 2,N= 210>, Top: 72( 2 )[ 1 : 72 : 0.937 ]( 71 , 104 , 109 ),<|>Tot Used: 436 , Added: 104 , Zero Std: 0 , Max Cor: 0.968
#>
5 <R=0.968,w= 2,N= 210>, Top: 13( 2 )[ 1 : 13 : 0.934 ]( 13 , 19 , 153 ),<|>Tot Used: 446 , Added: 19 , Zero Std: 0 , Max Cor: 0.974
#>
6 <R=0.974,w= 3,N= 160>, Top: 56( 1 )[ 1 : 56 : 0.887 ]( 55 , 82 , 159 ),<|>Tot Used: 484 , Added: 82 , Zero Std: 0 , Max Cor: 0.964
#>
7 <R=0.964,w= 3,N= 160>, Top: 14( 1 )[ 1 : 14 : 0.882 ]( 14 , 14 , 189 ),<|>Tot Used: 485 , Added: 14 , Zero Std: 0 , Max Cor: 0.882
#>
8 <R=0.882,w= 4,N= 191>, Top: 70( 3 )=( 1 )[ 2 : 70 : 0.813 ]( 68 , 94 , 198 ),<|>Tot Used: 521 , Added: 94 , Zero Std: 0 , Max Cor: 0.964
#>
9 <R=0.964,w= 4,N= 191>, Top: 9( 1 )[ 1 : 9 : 0.832 ]( 9 , 9 , 235 ),<|>Tot Used: 524 , Added: 9 , Zero Std: 0 , Max Cor: 0.829
#>
10 <R=0.829,w= 5,N= 12>, Top: 5( 2 )[ 1 : 5 : 0.800 ]( 5 , 7 , 239 ),<|>Tot Used: 525 , Added: 7 , Zero Std: 0 , Max Cor: 0.800
#>
11 <R=0.000,w= 5,N= 12>
#>
[ 11 ], 0.7995728 Decor Dimension: 525 . Cor to Base: 253 , ABase: 35 , Outcome Base: 0
#>
varlistc <- colnames(DEdataframe)[colnames(DEdataframe) != outcome]
pander::pander(sum(apply(dataframe[,varlist],2,var)))
718
pander::pander(sum(apply(DEdataframe[,varlistc],2,var)))
310
pander::pander(entropy(discretize(unlist(dataframe[,varlist]), 256)))
4.83
pander::pander(entropy(discretize(unlist(DEdataframe[,varlistc]), 256)))
3.49
if (!largeSet)
{
par(cex=0.6,cex.main=0.85,cex.axis=0.7)
UPSTM <- attr(DEdataframe,"UPSTM")
gplots::heatmap.2(1.0*(abs(UPSTM)>0),
trace = "none",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Decorrelation matrix",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="|Beta|>0",
xlab="Output Feature", ylab="Input Feature")
par(op)
}
if (!largeSet)
{
cormat <- cor(DEdataframe[,varlistc],method="pearson")
cormat[is.na(cormat)] <- 0
gplots::heatmap.2(abs(cormat),
trace = "none",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Correlation after IDeA",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="Pearson Correlation",
xlab="Feature", ylab="Feature")
par(op)
diag(cormat) <- 0
print(max(abs(cormat)))
}
[1]
0.7995728
classes <- unique(dataframe[,outcome])
raincolors <- rainbow(length(classes))
names(raincolors) <- classes
datasetframe.umap = umap(scale(dataframe[,varlist]),n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: Original",t='n')
text(datasetframe.umap$layout,labels=dataframe[,outcome],col=raincolors[dataframe[,outcome]+1])
datasetframe.umap = umap(scale(DEdataframe[,varlistc]),n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: After IDeA",t='n')
text(datasetframe.umap$layout,labels=DEdataframe[,outcome],col=raincolors[DEdataframe[,outcome]+1])
univarRAW <- uniRankVar(varlist,
paste(outcome,"~1"),
outcome,
dataframe,
rankingTest="AUC")
100 : std_MFCC_2nd_coef 200 : app_entropy_log_3_coef 300 :
app_LT_TKEO_mean_7_coef 400 : tqwt_entropy_log_dec_15 500 :
tqwt_medianValue_dec_7
600 : tqwt_stdValue_dec_35 700 : tqwt_skewnessValue_dec_27
univarDe <- uniRankVar(varlistc,
paste(outcome,"~1"),
outcome,
DEdataframe,
rankingTest="AUC",
)
100 : La_std_MFCC_2nd_coef 200 : La_app_entropy_log_3_coef 300 :
La_app_LT_TKEO_mean_7_coef 400 : La_tqwt_entropy_log_dec_15 500 :
tqwt_medianValue_dec_7
600 : La_tqwt_stdValue_dec_35 700 : tqwt_skewnessValue_dec_27
univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC")
##topfive
topvar <- c(1:length(varlist)) <= TopVariables
pander::pander(univarRAW$orderframe[topvar,univariate_columns])
| caseMean | caseStd | controlMean | controlStd | controlKSP | ROCAUC | |
|---|---|---|---|---|---|---|
| std_delta_delta_log_energy | 0.251 | 0.825 | -0.736 | 0.804 | 0.483 | 0.798 |
| std_delta_log_energy | 0.250 | 0.841 | -0.690 | 0.787 | 0.469 | 0.794 |
| std_9th_delta_delta | 0.347 | 0.952 | -0.611 | 0.674 | 0.766 | 0.787 |
| std_8th_delta_delta | 0.319 | 0.941 | -0.598 | 0.595 | 0.862 | 0.780 |
| std_7th_delta_delta | 0.324 | 0.905 | -0.558 | 0.647 | 0.977 | 0.776 |
| tqwt_entropy_log_dec_12 | -0.147 | 0.876 | 0.764 | 0.876 | 0.676 | 0.770 |
| std_6th_delta_delta | 0.311 | 0.851 | -0.470 | 0.540 | 0.896 | 0.768 |
| std_8th_delta | 0.310 | 0.950 | -0.587 | 0.637 | 0.971 | 0.767 |
| std_9th_delta | 0.306 | 0.885 | -0.519 | 0.660 | 0.330 | 0.764 |
| tqwt_entropy_shannon_dec_12 | -0.282 | 0.940 | 0.593 | 0.833 | 0.145 | 0.763 |
topLAvar <- univarDe$orderframe$Name[str_detect(univarDe$orderframe$Name,"La_")]
topLAvar <- unique(c(univarDe$orderframe$Name[topvar],topLAvar[1:as.integer(TopVariables/2)]))
finalTable <- univarDe$orderframe[topLAvar,univariate_columns]
theLaVar <- rownames(finalTable)[str_detect(rownames(finalTable),"La_")]
pander::pander(univarDe$orderframe[topLAvar,univariate_columns])
| caseMean | caseStd | controlMean | controlStd | controlKSP | ROCAUC | |
|---|---|---|---|---|---|---|
| std_delta_log_energy | 0.2502 | 0.841 | -0.690 | 0.787 | 4.69e-01 | 0.794 |
| std_9th_delta | 0.3061 | 0.885 | -0.519 | 0.660 | 3.30e-01 | 0.764 |
| mean_MFCC_2nd_coef | -0.3598 | 1.433 | -1.933 | 1.997 | 2.87e-06 | 0.753 |
| La_tqwt_energy_dec_33 | -0.0777 | 0.295 | 0.230 | 0.420 | 8.62e-01 | 0.749 |
| tqwt_entropy_shannon_dec_11 | -0.3083 | 0.958 | 0.504 | 0.884 | 5.34e-01 | 0.744 |
| La_std_3rd_delta | 0.1244 | 0.356 | -0.216 | 0.488 | 9.37e-02 | 0.736 |
| La_tqwt_entropy_shannon_dec_17 | -0.1131 | 0.508 | 0.135 | 0.175 | 6.94e-01 | 0.734 |
| tqwt_kurtosisValue_dec_36 | 0.1302 | 0.643 | -0.408 | 0.545 | 1.62e-02 | 0.733 |
| La_tqwt_kurtosisValue_dec_33 | 0.0505 | 0.377 | -0.301 | 0.511 | 1.54e-01 | 0.732 |
| La_apq11Shimmer | 0.0392 | 0.289 | -0.173 | 0.242 | 3.07e-01 | 0.732 |
dc <- getLatentCoefficients(DEdataframe)
fscores <- attr(DEdataframe,"fscore")
theSigDc <- dc[theLaVar]
names(theSigDc) <- NULL
theSigDc <- unique(names(unlist(theSigDc)))
theFormulas <- dc[rownames(finalTable)]
deFromula <- character(length(theFormulas))
names(deFromula) <- rownames(finalTable)
pander::pander(c(mean=mean(sapply(dc,length)),total=length(dc),fraction=length(dc)/(ncol(dataframe)-1)))
| mean | total | fraction |
|---|---|---|
| 2.46 | 466 | 0.626 |
allSigvars <- names(dc)
dx <- names(deFromula)[1]
for (dx in names(deFromula))
{
coef <- theFormulas[[dx]]
cname <- names(theFormulas[[dx]])
names(cname) <- cname
for (cf in names(coef))
{
if (cf != dx)
{
if (coef[cf]>0)
{
deFromula[dx] <- paste(deFromula[dx],
sprintf("+ %5.3f*%s",coef[cf],cname[cf]))
}
else
{
deFromula[dx] <- paste(deFromula[dx],
sprintf("%5.3f*%s",coef[cf],cname[cf]))
}
}
}
}
finalTable <- rbind(finalTable,univarRAW$orderframe[theSigDc[!(theSigDc %in% rownames(finalTable))],univariate_columns])
orgnamez <- rownames(finalTable)
orgnamez <- str_remove_all(orgnamez,"La_")
finalTable$RAWAUC <- univarRAW$orderframe[orgnamez,"ROCAUC"]
finalTable$DecorFormula <- deFromula[rownames(finalTable)]
finalTable$fscores <- fscores[rownames(finalTable)]
Final_Columns <- c("DecorFormula","caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC","RAWAUC","fscores")
finalTable <- finalTable[order(-finalTable$ROCAUC),]
pander::pander(finalTable[,Final_Columns])
| DecorFormula | caseMean | caseStd | controlMean | controlStd | controlKSP | ROCAUC | RAWAUC | fscores | |
|---|---|---|---|---|---|---|---|---|---|
| std_delta_log_energy | 0.2502 | 0.841 | -0.6902 | 0.787 | 4.69e-01 | 0.794 | 0.794 | 2 | |
| std_9th_delta | 0.3061 | 0.885 | -0.5192 | 0.660 | 3.30e-01 | 0.764 | 0.764 | 3 | |
| mean_MFCC_2nd_coef | -0.3598 | 1.433 | -1.9334 | 1.997 | 2.87e-06 | 0.753 | 0.753 | NA | |
| La_tqwt_energy_dec_33 | -1.046tqwt_energy_dec_31 + 1.000tqwt_energy_dec_33 | -0.0777 | 0.295 | 0.2303 | 0.420 | 8.62e-01 | 0.749 | 0.509 | 1 |
| tqwt_entropy_shannon_dec_11 | -0.3083 | 0.958 | 0.5035 | 0.884 | 5.34e-01 | 0.744 | 0.744 | 7 | |
| La_std_3rd_delta | -0.971std_MFCC_3rd_coef + 1.000std_3rd_delta | 0.1244 | 0.356 | -0.2163 | 0.488 | 9.37e-02 | 0.736 | 0.645 | 0 |
| La_tqwt_entropy_shannon_dec_17 | + 1.000tqwt_entropy_shannon_dec_17 + 0.990tqwt_minValue_dec_17 | -0.1131 | 0.508 | 0.1349 | 0.175 | 6.94e-01 | 0.734 | 0.709 | -1 |
| tqwt_kurtosisValue_dec_36 | 0.1302 | 0.643 | -0.4081 | 0.545 | 1.62e-02 | 0.733 | 0.733 | NA | |
| La_tqwt_kurtosisValue_dec_33 | -0.809tqwt_kurtosisValue_dec_31 + 1.000tqwt_kurtosisValue_dec_33 | 0.0505 | 0.377 | -0.3010 | 0.511 | 1.54e-01 | 0.732 | 0.628 | -1 |
| La_apq11Shimmer | -0.945locShimmer + 1.000apq11Shimmer | 0.0392 | 0.289 | -0.1725 | 0.242 | 3.07e-01 | 0.732 | 0.713 | -1 |
| apq11Shimmer | NA | 0.1867 | 0.824 | -0.5312 | 0.974 | 5.54e-01 | 0.713 | 0.713 | NA |
| tqwt_entropy_shannon_dec_17 | NA | -0.5108 | 1.187 | 0.2570 | 0.652 | 7.06e-03 | 0.709 | 0.709 | NA |
| locShimmer | NA | 0.1561 | 0.851 | -0.3796 | 1.006 | 9.39e-01 | 0.663 | 0.663 | 5 |
| std_3rd_delta | NA | 0.3138 | 1.025 | -0.2539 | 0.704 | 8.27e-01 | 0.645 | 0.645 | NA |
| tqwt_minValue_dec_17 | NA | 0.4016 | 1.110 | -0.1232 | 0.652 | 2.66e-02 | 0.636 | 0.636 | 12 |
| tqwt_kurtosisValue_dec_33 | NA | 0.2156 | 0.824 | -0.1164 | 0.756 | 5.93e-02 | 0.628 | 0.628 | NA |
| tqwt_energy_dec_31 | NA | 0.0548 | 0.802 | -0.1159 | 0.976 | 2.81e-01 | 0.580 | 0.580 | 3 |
| std_MFCC_3rd_coef | NA | 0.1950 | 0.969 | -0.0388 | 0.722 | 8.12e-01 | 0.552 | 0.552 | 1 |
| tqwt_energy_dec_33 | NA | -0.0204 | 0.897 | 0.1091 | 1.123 | 2.42e-01 | 0.509 | 0.509 | NA |
| tqwt_kurtosisValue_dec_31 | NA | 0.2041 | 0.837 | 0.2282 | 0.904 | 1.27e-01 | 0.490 | 0.490 | 3 |